last changed: 2023-08-28
Load the rlog transformed DESeq data set and construct
the required data frames. The function assay returns the
values and colData returns the information about the
experimental design.
> load(file = "./../Clustering/Data/rlog_H3K27ac.rda")
>
> m.H3.ac <- assay(rlog.heat)
> paged_table(m.H3.ac %>% as.data.frame())
> info.H3.ac <- data.frame(colData(rlog.heat))
> paged_table(info.H3.ac)
Next the GRanges object can be constructed using the
rownames of the DESeq data set, which contain information
about the location of each peak.
> chr <- str_split_i(rownames(m.H3.ac), ":", 1)
> pos <- str_split_i(rownames(m.H3.ac), ":", 2)
> start <- as.integer(str_split_i(pos, "-", 1))
> end <- as.integer(str_split_i(rownames(m.H3.ac), "-", 2))
>
> peaks.H3ac <- GRanges(seqnames = chr,
+ ranges =IRanges(start = start, end = end))
>
> saveRDS(peaks.H3ac, "./Data/peaks_H3ac.rds")
> paged_table(peaks.H3ac %>% as.data.frame())
To visualize the distance to TSS the by cluster, a list
containing a entry for
each cluster with the corresponding peaks is required. Thus, the
clustering results are loaded and used to filter the
GRanges object.
> kmeans.k3 <- readRDS("./../Clustering/Data/kmeans.rds")
>
> clus.kmeans <- melt(kmeans.k3$cluster)
> clus.kmeans <- cbind(clus.kmeans, 1:nrow(clus.kmeans))
> colnames(clus.kmeans) <- c("cluster", "ind")
> clus.kmeans <- clus.kmeans[order(clus.kmeans[, "cluster"]), ]
> paged_table(clus.kmeans)
> ind.cluster1 <- clus.kmeans[which(clus.kmeans[, "cluster"] == 1), "ind"]
> ind.cluster2 <- clus.kmeans[which(clus.kmeans[, "cluster"] == 2), "ind"]
> ind.cluster3 <- clus.kmeans[which(clus.kmeans[, "cluster"] == 3), "ind"]
>
> peaks.k1 <- peaks.H3ac[ind.cluster1, ]
> peaks.k2 <- peaks.H3ac[ind.cluster2, ]
> peaks.k3 <- peaks.H3ac[ind.cluster3, ]
>
> peaks <- list(cluster_1 = peaks.k1, cluster_2 = peaks.k2, cluster_3 = peaks.k3)
> head(peaks)
## $cluster_1
## GRanges object with 2799 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr14 64840548-64843743 *
## [2] chr14 64844047-64844598 *
## [3] chr1 116508159-116509891 *
## [4] chr2 58040368-58048461 *
## [5] chr5 37667044-37670097 *
## ... ... ... ...
## [2795] chr1 181133649-181136568 *
## [2796] chr8 127734251-127741103 *
## [2797] chr8 127742153-127742995 *
## [2798] chr11 8831934-8834702 *
## [2799] chr20 48846149-48847290 *
## -------
## seqinfo: 28 sequences from an unspecified genome; no seqlengths
##
## $cluster_2
## GRanges object with 1517 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr14 34718145-34719716 *
## [2] chr10 31099003-31099816 *
## [3] chr7 6445995-6449792 *
## [4] chr8 117520018-117521481 *
## [5] chr12 65946466-65950178 *
## ... ... ... ...
## [1513] chr8 142232614-142237657 *
## [1514] chrX 72269124-72270912 *
## [1515] chr7 73847739-73848606 *
## [1516] chr15 40306527-40309992 *
## [1517] chr5 89929848-89931827 *
## -------
## seqinfo: 28 sequences from an unspecified genome; no seqlengths
##
## $cluster_3
## GRanges object with 1984 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr14 64879344-64880265 *
## [2] chr20 43656766-43658787 *
## [3] chr20 43666580-43668593 *
## [4] chr13 51802460-51805452 *
## [5] chr10 31030898-31035191 *
## ... ... ... ...
## [1980] chr22 49825757-49828338 *
## [1981] chr8 56209657-56211303 *
## [1982] chr20 48826483-48829133 *
## [1983] chr14 73136349-73137361 *
## [1984] chr22 33919379-33921043 *
## -------
## seqinfo: 28 sequences from an unspecified genome; no seqlengths
The GRanges list can then be used to plot the average
distance to the TSS by cluster.
> hg38 <- TxDb.Hsapiens.UCSC.hg38.knownGene
> promoter <- getPromoters(TxDb = hg38, upstream=3000, downstream=3000)
> tagM.peaks <- lapply(peaks, getTagMatrix, windows=promoter)
## >> preparing start_site regions by gene... 2023-08-28 18:12:12
## >> preparing tag matrix... 2023-08-28 18:12:12
## >> preparing start_site regions by gene... 2023-08-28 18:12:47
## >> preparing tag matrix... 2023-08-28 18:12:47
## >> preparing start_site regions by gene... 2023-08-28 18:13:17
## >> preparing tag matrix... 2023-08-28 18:13:17
> plotAvgProf(tagM.peaks, TxDb = hg38, xlim=c(-3000, 3000),
+ xlab = "Genomic Region (5'->3')", ylab = "Peak Count") +
+ scale_color_manual(values = brewer.pal(n = 3, name = "Dark2"))
## >> plotting figure... 2023-08-28 18:13:48
Next, the peaks were annotated to genes using
annotatePeak and all known human genes. The resulting
object was then plotted with plotAnnoBar, to visualize the
distribution of features among peaks.
> anno.list <- lapply(peaks, annotatePeak, TxDb = hg38,
+ tssRegion=c(-3000, 3000), verbose=FALSE)
>
> plotAnnoBar(anno.list)
For the enrichment analysis the peaks were annotated to only coding genes within a range of -1 to 1 kb from the transcription start site (TSS). For this the peaks are annotated to using only the protein coding genes.
> TxDb.protein.coding <- loadDb( file="./Data/TxDb.protein.coding.sqlite")
> seqlevelsStyle(TxDb.protein.coding) <- "UCSC"
>
> peak.coding <- lapply(peaks, annotatePeak, TxDb = TxDb.protein.coding,
+ tssRegion = c(-3000, 3000), verbose=FALSE)
Next, the following function was used to translate the ENSEMBL identifiers to ENTREZID.
> # generate required structure:
> conv2table<-function(x){
+ eg = bitr(x@anno$geneId, fromType="ENSEMBL", toType="SYMBOL", OrgDb="org.Hs.eg.db")
+ ez = bitr(x@anno$geneId, fromType="ENSEMBL", toType="ENTREZID", OrgDb="org.Hs.eg.db")
+ anno.table<-x@anno
+ #add gene symbols
+ mx<-match(anno.table$geneId, eg$ENSEMBL)
+ anno.table$gene_symbol<-eg[mx,"SYMBOL"]
+ #add gene entrezID
+ mx<-match(anno.table$geneId, ez$ENSEMBL)
+ anno.table$gene_entrez<-ez[mx,"ENTREZID"]
+
+ res.table<-cbind(
+ chr=as.vector(seqnames(anno.table)),
+ start=start(anno.table),
+ end=end(anno.table),
+ name=anno.table$name,
+ peak.p_adj=anno.table$qValue,
+ strand=rep(".", length(anno.table)),
+ genome_anno=anno.table$annotation,
+ gene_symbol=anno.table$gene_symbol,
+ gene_entrez=anno.table$gene_entrez,
+ distanceToTSS=anno.table$distanceToTSS,
+ transcriptId=anno.table$transcriptId,
+ geneId_ensembl=anno.table$geneId)
+ }
>
> peak.coding.ID <- lapply(peak.coding, function(n) conv2table(n))
The resulting list was then filtered for peaks which are within a range of -1 to 1 kb from the TSS.
> peak.coding.1kb <- lapply(peak.coding.ID, function(x)
+ filter(as.data.frame(x),
+ abs(as.integer(as.data.frame(x)$distanceToTSS)) <= 1000))
>
> saveRDS(peak.coding.1kb, file = "./Data/peaks_H3ac_cod1kb.rds")
> lapply(peak.coding.1kb, head)
## $cluster_1
## chr start end strand genome_anno gene_symbol gene_entrez
## 1 chr2 58040368 58048461 . Promoter (<=1kb) VRK2 7444
## 2 chr4 113977892 113980658 . Promoter (<=1kb) ARSJ 79642
## 3 chr12 103929052 103930016 . Promoter (<=1kb) HSP90B1 7184
## 4 chr12 103930337 103931619 . Promoter (<=1kb) HSP90B1 7184
## 5 chr4 159282053 159285899 . Promoter (<=1kb) RAPGEF2 9693
## 6 chr9 69375779 69378259 . Promoter (<=1kb) ENTREP1 9413
## distanceToTSS transcriptId geneId_ensembl
## 1 0 ENST00000412104 ENSG00000028116
## 2 0 ENST00000315366 ENSG00000180801
## 3 -91 ENST00000549334 ENSG00000166598
## 4 0 ENST00000680391 ENSG00000166598
## 5 0 ENST00000514565 ENSG00000109756
## 6 0 ENST00000645516 ENSG00000135063
##
## $cluster_2
## chr start end strand genome_anno gene_symbol gene_entrez
## 1 chr7 6445995 6449792 . Promoter (<=1kb) DAGLB 221955
## 2 chr8 117520018 117521481 . Promoter (<=1kb) MED30 90390
## 3 chr1 204213358 204214078 . Promoter (<=1kb) GOLT1A 127845
## 4 chr9 128455194 128457511 . Promoter (<=1kb) ODF2 4957
## 5 chr17 78167924 78169666 . Promoter (<=1kb) SYNGR2 9144
## 6 chr10 63266723 63270752 . Promoter (<=1kb) JMJD1C 221037
## distanceToTSS transcriptId geneId_ensembl
## 1 0 ENST00000297056 ENSG00000164535
## 2 0 ENST00000297347 ENSG00000164758
## 3 0 ENST00000308302 ENSG00000174567
## 4 0 ENST00000351030 ENSG00000136811
## 5 0 ENST00000225777 ENSG00000108639
## 6 0 ENST00000402544 ENSG00000171988
##
## $cluster_3
## chr start end strand genome_anno gene_symbol gene_entrez
## 1 chr14 64879344 64880265 . Promoter (<=1kb) SPTB 6710
## 2 chr20 43666580 43668593 . Promoter (<=1kb) MYBL2 4605
## 3 chr13 51802460 51805452 . Promoter (<=1kb) DHRS12 79758
## 4 chr10 31030898 31035191 . Promoter (<=1kb) ZNF438 220929
## 5 chr19 18519575 18522156 . Promoter (<=1kb) ELL 8178
## 6 chr17 30969849 30972583 . Promoter (<=1kb) RNF135 84282
## distanceToTSS transcriptId geneId_ensembl
## 1 0 ENST00000644917 ENSG00000070182
## 2 0 ENST00000396863 ENSG00000101057
## 3 0 ENST00000650833 ENSG00000102796
## 4 0 ENST00000436087 ENSG00000183621
## 5 0 ENST00000262809 ENSG00000105656
## 6 0 ENST00000443677 ENSG00000181481
> names(peak.coding.1kb) <- c("1", "2", "3")
> cod.genes.1kb <- lapply(peak.coding.1kb, function(i) as.data.frame(i)$gene_entrez)
Finally, the filtered list with the coding genes can be used for an
enrichment analysis using the function enrichPathway, which
can be plotted using dotplot.
> comp.Reac <- compareCluster(geneCluster = cod.genes.1kb,
+ fun = "enrichPathway",
+ pvalueCutoff = 0.05,
+ pAdjustMethod = "BH")
> paged_table(comp.Reac %>% as.data.frame())
> dotplot(comp.Reac, showCategory = 10,
+ title = "Reactome Pathway Enrichment Analysis (coding genes)")
Exactly the same as above can be done with the function
enrichGO.
> comp.GO <- compareCluster(geneCluster = cod.genes.1kb,
+ fun = "enrichGO",
+ OrgDb = org.Hs.eg.db,
+ ont = "BP",
+ readable = T,
+ pvalueCutoff = 0.05,
+ pAdjustMethod = "BH")
> paged_table(comp.GO %>% as.data.frame())
> dotplot(comp.GO, showCategory = 10,
+ title = "GO Enrichment Analysis (coding genes)")
The following versions of R and R packages were used to generate the report above:
## R version 4.3.0 (2023-04-21)
## Platform: x86_64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.4
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: Europe/Berlin
## tzcode source: internal
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] reshape2_1.4.4
## [2] ggplot2_3.4.2
## [3] dplyr_1.1.2
## [4] RColorBrewer_1.1-3
## [5] ReactomePA_1.44.0
## [6] org.Hs.eg.db_3.17.0
## [7] clusterProfiler_4.8.1
## [8] TxDb.Hsapiens.UCSC.hg38.knownGene_3.17.0
## [9] GenomicFeatures_1.52.0
## [10] AnnotationDbi_1.62.1
## [11] DESeq2_1.40.1
## [12] SummarizedExperiment_1.30.2
## [13] Biobase_2.60.0
## [14] MatrixGenerics_1.12.2
## [15] matrixStats_1.0.0
## [16] GenomicRanges_1.52.0
## [17] GenomeInfoDb_1.36.0
## [18] IRanges_2.34.0
## [19] S4Vectors_0.38.1
## [20] BiocGenerics_0.46.0
## [21] ChIPseeker_1.36.0
## [22] stringr_1.5.0
## [23] report_0.5.7
## [24] rmarkdown_2.23
## [25] knitr_1.43
##
## loaded via a namespace (and not attached):
## [1] splines_4.3.0
## [2] BiocIO_1.10.0
## [3] bitops_1.0-7
## [4] ggplotify_0.1.0
## [5] filelock_1.0.2
## [6] tibble_3.2.1
## [7] polyclip_1.10-4
## [8] graph_1.78.0
## [9] XML_3.99-0.14
## [10] lifecycle_1.0.3
## [11] lattice_0.21-8
## [12] MASS_7.3-60
## [13] insight_0.19.3
## [14] magrittr_2.0.3
## [15] sass_0.4.6
## [16] jquerylib_0.1.4
## [17] yaml_2.3.7
## [18] plotrix_3.8-2
## [19] cowplot_1.1.1
## [20] DBI_1.1.3
## [21] zlibbioc_1.46.0
## [22] purrr_1.0.1
## [23] ggraph_2.1.0
## [24] RCurl_1.98-1.12
## [25] yulab.utils_0.0.6
## [26] tweenr_2.0.2
## [27] rappdirs_0.3.3
## [28] GenomeInfoDbData_1.2.10
## [29] enrichplot_1.20.0
## [30] ggrepel_0.9.3
## [31] tidytree_0.4.2
## [32] reactome.db_1.84.0
## [33] codetools_0.2-19
## [34] DelayedArray_0.26.3
## [35] DOSE_3.26.1
## [36] xml2_1.3.4
## [37] ggforce_0.4.1
## [38] tidyselect_1.2.0
## [39] aplot_0.1.10
## [40] farver_2.1.1
## [41] viridis_0.6.3
## [42] BiocFileCache_2.8.0
## [43] GenomicAlignments_1.36.0
## [44] jsonlite_1.8.5
## [45] tidygraph_1.2.3
## [46] tools_4.3.0
## [47] progress_1.2.2
## [48] treeio_1.24.1
## [49] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
## [50] Rcpp_1.0.10
## [51] glue_1.6.2
## [52] gridExtra_2.3
## [53] xfun_0.39
## [54] qvalue_2.32.0
## [55] withr_2.5.0
## [56] fastmap_1.1.1
## [57] boot_1.3-28.1
## [58] fansi_1.0.4
## [59] caTools_1.18.2
## [60] digest_0.6.31
## [61] R6_2.5.1
## [62] gridGraphics_0.5-1
## [63] colorspace_2.1-0
## [64] GO.db_3.17.0
## [65] gtools_3.9.4
## [66] biomaRt_2.56.1
## [67] RSQLite_2.3.1
## [68] utf8_1.2.3
## [69] tidyr_1.3.0
## [70] generics_0.1.3
## [71] data.table_1.14.8
## [72] rtracklayer_1.60.0
## [73] prettyunits_1.1.1
## [74] graphlayouts_1.0.0
## [75] httr_1.4.6
## [76] S4Arrays_1.0.4
## [77] scatterpie_0.2.1
## [78] graphite_1.46.0
## [79] pkgconfig_2.0.3
## [80] gtable_0.3.3
## [81] blob_1.2.4
## [82] XVector_0.40.0
## [83] shadowtext_0.1.2
## [84] htmltools_0.5.5
## [85] fgsea_1.26.0
## [86] scales_1.2.1
## [87] png_0.1-8
## [88] ggfun_0.1.0
## [89] rstudioapi_0.14
## [90] rjson_0.2.21
## [91] nlme_3.1-162
## [92] curl_5.0.1
## [93] cachem_1.0.8
## [94] KernSmooth_2.23-21
## [95] parallel_4.3.0
## [96] HDO.db_0.99.1
## [97] restfulr_0.0.15
## [98] pillar_1.9.0
## [99] grid_4.3.0
## [100] vctrs_0.6.3
## [101] gplots_3.1.3
## [102] dbplyr_2.3.2
## [103] evaluate_0.21
## [104] cli_3.6.1
## [105] locfit_1.5-9.8
## [106] compiler_4.3.0
## [107] Rsamtools_2.16.0
## [108] rlang_1.1.1
## [109] crayon_1.5.2
## [110] labeling_0.4.2
## [111] plyr_1.8.8
## [112] stringi_1.7.12
## [113] viridisLite_0.4.2
## [114] BiocParallel_1.34.2
## [115] munsell_0.5.0
## [116] Biostrings_2.68.1
## [117] lazyeval_0.2.2
## [118] GOSemSim_2.26.0
## [119] Matrix_1.5-4.1
## [120] hms_1.1.3
## [121] patchwork_1.1.2
## [122] bit64_4.0.5
## [123] KEGGREST_1.40.0
## [124] highr_0.10
## [125] igraph_1.5.0
## [126] memoise_2.0.1
## [127] bslib_0.5.0
## [128] ggtree_3.8.0
## [129] fastmatch_1.1-3
## [130] bit_4.0.5
## [131] downloader_0.4
## [132] ape_5.7-1
## [133] gson_0.1.0